In tobiasgf/lulu: Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

knitr::opts_chunk$set(echo = TRUE)

This file records the calculation metrics for the manuscript "Reliable biodiversity metrics from co-occurence based post-clustering curation of amplicon data".

For each table the following metrics are calculated:
(a) Linear regression of OTU richness vs Plant richness for the 130 samples (including r^2^ value),
(b) Number of OTUs,
(c) Taxonomic redundancy,
(d) Betadiversity (species/OTU turnover between sites), and
(e) Community dissimilarity at genus level between plant inventory and taxonomic composition of OTUs.

This step should be carried out after the LULU curation of the OTU tables documented in the file: E_Taxonomic_filtering.Rmd
NB: All markdown chuncks are set to "eval=FALSE". Change these accordingly. Also code blocks to be run outside R, has been #'ed out. Change this accordingly.

Bioinformatic tools necessary

Make sure that you have the following bioinformatic tools in your PATH
R packages: stringr, dplyr, tidyr

Analysis files

This step is dependent on the presence of OTU tables (un-curated tables and the corresponding tables curated with LULU)

Setting directories and libraries etc

setwd("~/analyses")
main_path <- getwd()
path <- file.path(main_path, "otutables_processing")
library(stringr)
library(dplyr)
library(tidyr)
require(vegan)

For each of the OTU tables: Calculating the OTU richness plot wise. For each method/table calculate taxonomic redundancy, total OTU count, Number of unique taxonomic names, Number of taxonomic names which are also present in the observational data

allFiles <- list.files(path)
all_plTabs <- allFiles[grepl("planttable$", allFiles)]
all_prTabs <- allFiles[grepl("planttable.luluprocessed$", allFiles)]
all_Tabs <-  c(all_plTabs,all_prTabs)
read_tabs <- file.path(path, all_Tabs)
# Vector for filtering, etc. at this step redundant, but included for safety
samples <- c("S001","S002","S003","S004","S005","S006","S007","S008","S067",
             "S009","S010","S011","S012","S013","S014","S040","S068","S015",
             "S016","S017","S018","S069","S070","S019","S020","S021","S022",
             "S024","S025","S026","S027","S041","S028","S029","S030","S032",
             "S033","S034","S035","S042","S036","S037","S038","S039","S086",
             "S087","S088","S089","S044","S071","S045","S046","S047","S048",
             "S049","S050","S051","S052","S053","S055","S056","S057","S058",
             "S090","S059","S060","S061","S062","S063","S064","S065","S066",
             "S072","S073","S074","S075","S076","S077","S078","S091","S079",
             "S080","S081","S082","S083","S084","S085","S092","S094","S095",
             "S096","S097","S098","S099","S100","S101","S102","S103","S104",
             "S106","S107","S108","S109","S133","S110","S111","S112","S113",
             "S114","S115","S116","S117","S118","S119","S120","S121","S122",
             "S123","S124","S134","S125","S126","S127","S129","S130","S131",
             "S132","S135","S136","S137")  

tab_name <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt")
otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)

tab_name <- file.path(main_path,"Table_plants_2014_cleaned.txt")
Plant_data2014 <- read.table(tab_name, sep="\t", row.names = 1, header=TRUE,
                             as.is=TRUE)

Plant_richness <- colSums(Plant_data2014)
otu_richness <- data.frame(matrix(NA, nrow = 130, ncol = length(all_Tabs)))
names(otu_richness) <- all_Tabs
rel_redundancy <- vector()
total_otu <- vector()
mean_pident <- vector()
corcoeffs <- vector()
betadiversity <- vector()

##inserted
lm_intercept <- vector()
lm_slope <- vector()
read_sum <- vector()
Num_otu_taxa_method <- vector()
otu_taxa_method <- list()
singleton_share  <- vector()
doubleton_share  <- vector()
ab_diss <- list()
pa_diss <- list()
##inserted until here

for(i in seq_along(read_tabs)) {
  tab <- read.csv(read_tabs[i],sep='\t',header=T,as.is=TRUE,row.names = 1) #read table
  tab <- tab[,samples] # order samples
  otu_richness[,i] = colSums(tab>0) # calculate plot wise richness
  amp_index <- row.names(tab) #OTU id's of current table
  reftaxindex <- which(otutaxonomy$qseqid %in% amp_index) # index of which OTUs are present in the current table

  ##inserted
  perfect_match_index <- which(otutaxonomy$pident == 100 & otutaxonomy$qseqid %in% amp_index) # index of which OTUs are present in the current
  otu_taxa_method[[i]] <- names(table(otutaxonomy$species[perfect_match_index])) #Which species names have been identified in the current table
  Num_otu_taxa_method[i] <- length(otu_taxa_method[[i]]) # Number of plant species names
  ## until here

  mean_pident[[i]] <- mean(otutaxonomy$pident[reftaxindex]) # average genbank match %

  spec <- otutaxonomy$species[reftaxindex] # names of all OTUs
  redundancy <- sum((table(spec) -1)) # count of taxonomically redundant OTUs
  total_otu[i] <- nrow(tab)   #total number of OTUs present in the table
  betadiversity[i] <- total_otu[i]/mean(otu_richness[,i])
  rel_redundancy[i] <- redundancy/total_otu[i] #  relative redundancy
  # R^2 of linear regression of OTU richness vs plant richness
  corcoeffs[i] <- (cor(Plant_richness,otu_richness[,i]))^2
  lm_fit <- lm(otu_richness[,i]~ Plant_richness)
  lm_intercept[i] <- lm_fit$coefficients[1]
  lm_slope[i] <- lm_fit$coefficients[2]
  read_sum[i] <- sum(tab)

  #Inserted. community dissimilarity
  stable <- tab 
  trans_table <- t(stable)
  rowindex <- rowSums(trans_table) != 0
  trans_table <- trans_table[rowindex,]
  stand_table <- decostand(trans_table, "hellinger")
  ab_diss[[i]] <- vegdist(stand_table, method="bray", binary=FALSE)
  pa_diss[[i]] <- vegdist(stand_table, method="bray", binary=TRUE)
  #inserted until here

  #inserted
  tab2 <- tab
  tab2[tab2>1] <- 1
  singleton_share[i] <- sum(rowSums(tab2)==1)/total_otu[i]
  doubleton_share[i] <- sum(rowSums(tab2)==2)/total_otu[i]

}

## Inserted 
#MANTEL test for curation effect on both presence absence (pa) tables and abundance tables (ab)
p_table <- Plant_data2014
names(p_table) <- samples
trans_p_table <- t(p_table)
rowindex <- rowSums(trans_p_table) != 0
trans_p_table <- trans_p_table[rowindex,]
plant_pa_diss <- vegdist(trans_p_table, method="bray", binary=TRUE)

site_names <- names(plant_table)

pa_curation <- list() # Manteltest for curation effect on pa data
ab_curation <- list() # Manteltest for curation effect on ab data
pa_statistic <- vector() # Mantel statistic r (pa)
pa_signif <- vector() # significance level (pa)
ab_statistic <- vector() # Mantel statistic r (ab)
ab_signif <- vector() # significance level (ab)
for(i in 1:(length(read_tabs)/2)) {
 pa_curation[[i]] <- mantel(pa_diss[[i]], pa_diss[[i+20]], method="pearson", permutations=999)
 pa_statistic[i] <- pa_curation[[i]]$statistic
 pa_signif[i] <- pa_curation[[i]]$signif

 ab_curation[[i]] <- mantel(ab_diss[[i]], ab_diss[[i+20]], method="pearson", permutations=999)
 ab_statistic[i] <- ab_curation[[i]]$statistic
 ab_signif[i] <- ab_curation[[i]]$signif
}

#MANTEL test for correlation with plant data on both presence absence (pa) tables and abundance tables (ab)
pa_vs_plant <- list() # Manteltest for plant data vd sequence data (pa)
ab_vs_plant <- list() # Manteltest for plant data vd sequence data (ab)
pa_vs_plant_statistic <- vector()  # Mantel statistic r (pa)
pa_vs_plant_signif <- vector() # significance level (pa)
ab_vs_plant_statistic <- vector() # Mantel statistic r (ab)
ab_vs_plant_signif <- vector() # significance level (ab)

for(i in 1:(length(read_tabs))) {
 pa_vs_plant[[i]] <- mantel(plant_pa_diss, pa_diss[[i]], method="pearson", permutations=999)
 pa_vs_plant_statistic[i] <- pa_vs_plant[[i]]$statistic
 pa_vs_plant_signif[i] <- pa_vs_plant[[i]]$signif
 ab_vs_plant[[i]] <- mantel(plant_pa_diss, ab_diss[[i]], method="pearson", permutations=999)
 ab_vs_plant_statistic[i] <- ab_vs_plant[[i]]$statistic
 ab_vs_plant_signif[i] <- ab_vs_plant[[i]]$signif
}
##Inserted until here

#Synchronize names for methods, levels and curation state and 
#   collect table statistics in one table
method <- str_split_fixed(all_Tabs, "_", 3)[,1]
method[method == "DADA2"] <- "DADA2(+VS)"
method[method == "DADA2VSEARCH"] <- "DADA2(+VS)"
level <- str_split_fixed(all_Tabs, "_", 3)[,2]
level <- gsub(".planttable","",level)
level[level == "0.95"] <- "95"
level[level == "0.96"] <- "96"
level[level == "0.97"] <- "97"
level[level == "0.98"] <- "98"
level[level == "0.985"] <- "98.5"
level[level == "NO"] <- "99/100"
level[level == "3"] <- "99/100"
level[level == "5"] <- "98.5"
level[level == "7"] <- "98"
level[level == "10"] <- "97"
level[level == "13"] <- "96"
level[level == "15"] <- "95"
level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95"))
#identify LULU curated tables
processed <- str_split_fixed(all_Tabs, "_", 3)[,3]
luluindex <- which(processed == "luluprocessed")
processed[luluindex] <- "curated"
processed[-luluindex] <- "raw"

#Merge all results in one table
method_statistics <- data.frame(Method=method,Level=level,
                                Curated=processed,Correlation=corcoeffs,
                                Redundancy=rel_redundancy,OTU_count=total_otu,
                                Mean_match=mean_pident,Beta=betadiversity,
                                Intercept = lm_intercept, Slope=lm_slope,
                                Total_readcount = read_sum, Taxa=Num_otu_taxa_method, 
                                Singleton=singleton_share,Doubleton=doubleton_share,
                                Com_dissim_PA_stat=pa_vs_plant_statistic,
                                Com_dissim_PA_sig=pa_vs_plant_signif,
                                Com_dissim_AB_stat=ab_vs_plant_statistic,
                                Com_dissim_AB_sig=ab_vs_plant_signif)

tab_name <- file.path(main_path,"Table_method_statistics_updated_revision1.txt")
{write.table(method_statistics, tab_name, sep="\t",quote=FALSE, col.names = NA)}

#Merge manteltests on curation effect in one table
mantel_tests_curation_effet <- data.frame(Method=method[1:20], Level=level[1:20], 
                                          PA_curation = pa_statistic, PA_significance=pa_signif, 
                                          AB_curation=ab_statistic, AB_significance=ab_signif)
tab_name <- file.path(main_path,"Table_manteltests.txt")
{write.table(mantel_tests_curation_effet, tab_name, sep="\t",quote=FALSE, col.names = NA)}

Construct a full plant richness vs OTU richness table and synchronize names for methods, levels and curation state

# add Plant richness to OTU richness dataframe
richness_incl_obs <- cbind(Obs_richness=Plant_richness,otu_richness) 
total_richness_df <- gather(richness_incl_obs, key=Method, 
                            value=OTU_richness,-1)

method <- str_split_fixed(total_richness_df$Method, "_", 3)[,1]
method[method == "DADA2"] <- "DADA2(+VS)"
method[method == "DADA2VSEARCH"] <- "DADA2(+VS)"
level <- str_split_fixed(total_richness_df$Method, "_", 3)[,2]
level <- gsub(".planttable","",level)
level[level == "0.95"] <- "95"
level[level == "0.96"] <- "96"
level[level == "0.97"] <- "97"
level[level == "0.98"] <- "98"
level[level == "0.985"] <- "98.5"
level[level == "NO"] <- "99/100"
level[level == "3"] <- "99/100"
level[level == "5"] <- "98.5"
level[level == "7"] <- "98"
level[level == "10"] <- "97"
level[level == "13"] <- "96"
level[level == "15"] <- "95"
level[level == "100"] <- "99"
level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95"))
processed <- str_split_fixed(total_richness_df$Method, "_", 3)[,3]
luluindex <- which(processed == "luluprocessed")
processed[luluindex] <- "curated"
processed[-luluindex] <- "raw"
total_richness_df2 <- data.frame(Method=method,Level=level,Curated=processed,
                                 Obs=total_richness_df$Obs_richness,
                                 OTU=total_richness_df$OTU_richness)

#save a long formatted table for ggplot
tab_name <- file.path(main_path,"Table_richness_calculations_long.txt")
{write.table(total_richness_df2, tab_name, sep="\t",quote=FALSE, col.names = NA)}

#save a wide formatted table for overview
tab_name <- file.path(main_path,"Table_richness_calculations_wide.txt")
{write.table(richness_incl_obs, tab_name, sep="\t",quote=FALSE, col.names = NA)}

Construct a table with the best match (%) for each OTU pr method/table separating retained and discarded OTUs.

allFiles <- list.files(path)
all_prTabs <- allFiles[grepl("LULU-", allFiles)]
prTab_names <- sort(as.vector(sapply(all_prTabs, 
                                     function(x) strsplit(x, "LULU-")[[1]][2])))

rds <- list()
read_tabs <- file.path(path, all_prTabs)

tab_name <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt")
otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)

retained_avg <- vector()
discarded_avg <- vector()
pident_frame <- data.frame()
for(i in seq_along(read_tabs)) {
  rds <- readRDS(read_tabs[i]) # read the saved rds LULU output
  retained_amplicons <- rds$cured_OTUs # extract the cured/retained OTUs
  discarded_amplicons <- rds$discarded_OTUs 
  number_observations <- length(retained_amplicons)+length(discarded_amplicons)
  retained_index <- which(otutaxonomy$qseqid %in% retained_amplicons)
  discarded_index <- which(otutaxonomy$qseqid %in% discarded_amplicons)
  retained_pident <- otutaxonomy$pident[retained_index]
  discarded_pident <- otutaxonomy$pident[discarded_index]
  method_pident <- rep(prTab_names[i],number_observations)
  retained_or_discarded <- c(rep("retained",length(retained_amplicons)),
                             rep("discarded",length(discarded_amplicons)))
  pident <- c(retained_pident,discarded_pident)
  current_pident_frame <- data.frame(method_pident,retained_or_discarded,pident)
  pident_frame <- rbind(pident_frame,current_pident_frame)
  # retained_avg[i] <- mean(retained_pident)
  # discarded_avg[i] <- mean(discarded_pident)
}

#Synchronize names for methods, levels and curation state
pident_frame$level_pident <- 
 str_split_fixed(as.character(pident_frame$method_pident), "_", 2)[,2]
pident_frame$method_pident <- 
 str_split_fixed(as.character(pident_frame$method_pident), "_", 2)[,1]
pident_frame$method_pident[pident_frame$method_pident == "DADA2"] <- 
 "DADA2(+VS)"
pident_frame$method_pident[pident_frame$method_pident == "DADA2VSEARCH"] <- 
 "DADA2(+VS)"
pident_frame$level_pident[pident_frame$level_pident == "0.95"] <- "95"
pident_frame$level_pident[pident_frame$level_pident == "0.96"] <- "96"
pident_frame$level_pident[pident_frame$level_pident == "0.97"] <- "97"
pident_frame$level_pident[pident_frame$level_pident == "0.98"] <- "98"
pident_frame$level_pident[pident_frame$level_pident == "0.985"] <- "98.5"
pident_frame$level_pident[pident_frame$level_pident == "NO"] <- "99/100"
pident_frame$level_pident[pident_frame$level_pident == "3"] <- "99/100"
pident_frame$level_pident[pident_frame$level_pident == "5"] <- "98.5"
pident_frame$level_pident[pident_frame$level_pident == "7"] <- "98"
pident_frame$level_pident[pident_frame$level_pident == "10"] <- "97"
pident_frame$level_pident[pident_frame$level_pident == "13"] <- "96"
pident_frame$level_pident[pident_frame$level_pident == "15"] <- "95"
pident_frame$level_pident <- 
 factor(pident_frame$level_pident,levels = c("99/100", "98.5", 
                                             "98", "97", "96", "95"))

tab_name <- file.path(main_path,"Table_OTU_match_rates.txt")
{write.table(pident_frame, tab_name, sep="\t",quote=FALSE, col.names = NA)}

Taxonomic dissimilarity
Constructing genus level OTU tables. Read counts are reduced to presence/absence, and observations are number of species in respective genera/families.

tab_name <- tbname <- file.path(main_path,"Table_plants_genus_2014_cleaned.txt")
genussummed_reference <- read.table(tab_name, sep="\t", row.names = 1, 
                                    header=TRUE, as.is=TRUE)

allFiles <- list.files(path)
all_plTabs <- allFiles[grepl("planttable$", allFiles)]
all_prTabs <- allFiles[grepl("planttable.luluprocessed$", allFiles)]
all_Tabs <-  c(all_plTabs,all_prTabs)
read_tabs <- file.path(path, all_Tabs)
proc_tabs <- file.path(path, paste0(all_Tabs,".for_visual_inspectionX.txt"))

tab_name <- tbname <- file.path(main_path,"sample_list.txt")
siteinfo <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)

tab_name <- tbname <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt")
otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)
row.names(otutaxonomy) <- otutaxonomy$qseqid
otutab_genussummed <- list()
otu_genera <- vector()
for(i in seq_along(read_tabs)) {
  tab <- read.csv(read_tabs[i],sep='\t',header=T,as.is=TRUE,row.names = 1)
  tab[tab>0] <- 1
  tabs <- tab[,siteinfo$Name]
  tabg <- tab[,siteinfo$Name]
  names(tabg) <- siteinfo$BW
  names(tabs) <- siteinfo$BW

  speciesnames <- otutaxonomy[row.names(tabs),"species"]
  tabs$species <- speciesnames
  tabg$genus <-  str_split_fixed(speciesnames, "_", 2)[,1]

  otutab_genussummed[[i]] <- tabg %>% group_by(genus) %>% summarise_each(funs(sum))
  otutab_genussummed[[i]] <- data.frame(otutab_genussummed[[i]])

  row.names(otutab_genussummed[[i]]) <- otutab_genussummed[[i]]$genus
  otutab_genussummed[[i]] <- otutab_genussummed[[i]][,-1]
  otu_genera <- c(otu_genera,tabg$genus)
  # output OTU tables with species names for visual inspection (remove #)
  #{write.table(tabs, proc_tabs[i], sep="\t",quote=FALSE, col.names = NA)} 
}

community_distance_table <- 
 data.frame(matrix(0, ncol = length(all_Tabs), nrow = 130, 
                   dimnames = list(siteinfo$BW,all_Tabs)))
for(i in seq_along(all_Tabs)) {
  otugenera_clean <- row.names(otutab_genussummed[[i]])
  otugenera_clean <- otugenera_clean[otugenera_clean != ""]
  otugenera_clean <- otugenera_clean[otugenera_clean != "x"]
  obsgenera_clean <- row.names(genussummed_reference)
  obsgenera_clean <- obsgenera_clean[obsgenera_clean != ""]
  obsgenera_clean <- obsgenera_clean[obsgenera_clean != "x"]
  commongenera <- names(table(c(otugenera_clean,obsgenera_clean)))
  otuframe <- 
   data.frame(matrix(0, nrow = length(commongenera), 
                     ncol = 130, dimnames = list(commongenera,siteinfo$BW)))
  otuframe[otugenera_clean,] <- otutab_genussummed[[i]][otugenera_clean,]

  obsframe <- 
   data.frame(matrix(0, nrow = length(commongenera),
                     ncol = 130, dimnames = list(commongenera,siteinfo$BW)))
  obsframe[obsgenera_clean,siteinfo$BW] <- 
   genussummed_reference[obsgenera_clean,siteinfo$BW]

  subtracted_frame <- abs(obsframe-otuframe)
  absolute_distance <- colSums(subtracted_frame)
  community_distance_table[,i] <- absolute_distance
}

community_distance_table$BW <- siteinfo$BW 

gathered_dissimilarity <- gather(community_distance_table,
                                 method,dissimilarity,1:40)

method <- str_split_fixed(gathered_dissimilarity$method, "_", 3)[,1]
method[method == "DADA2"] <- "DADA2(+VS)"
method[method == "DADA2VSEARCH"] <- "DADA2(+VS)"
level <- str_split_fixed(gathered_dissimilarity$method, "_", 3)[,2]
level <- gsub(".planttable","",level)
level[level == "0.95"] <- "95"
level[level == "0.96"] <- "96"
level[level == "0.97"] <- "97"
level[level == "0.98"] <- "98"
level[level == "0.985"] <- "98.5"
level[level == "NO"] <- "99/100"
level[level == "3"] <- "99/100"
level[level == "5"] <- "98.5"
level[level == "7"] <- "98"
level[level == "10"] <- "97"
level[level == "13"] <- "96"
level[level == "15"] <- "95"
level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95"))

processed <- str_split_fixed(gathered_dissimilarity$method, "_", 3)[,3]
luluindex <- which(processed == "luluprocessed")
processed[luluindex] <- "curated"
processed[-luluindex] <- "raw"

gathered_dissimilarity_processed <- 
 data.frame(Method=method,Level=level,Curated=processed,
            Site=gathered_dissimilarity$BW,
            Dissimilarity=gathered_dissimilarity$dissimilarity)

#save a table for ggplot (long format)
tab_name <- file.path(main_path,"Table_taxonomic_dissimilatiry_long.txt")
{write.table(gathered_dissimilarity_processed, tab_name, sep="\t",
             quote=FALSE, row.names = FALSE)}

#save a table for overview (wide format)
tab_name <- file.path(main_path,"Table_taxonomic_dissimilatiry_wide.txt")
{write.table(community_distance_table, tab_name, sep="\t",quote=FALSE, 
             col.names = NA)}

tobiasgf/lulu documentation built on Jan. 17, 2024, 3:57 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tobiasgf/lulu
Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

In tobiasgf/lulu: Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

Bioinformatic tools necessary

Analysis files

R Package Documentation

Browse R Packages

We want your feedback!

tobiasgf/lulu Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

In tobiasgf/lulu: Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

Bioinformatic tools necessary

Analysis files

R Package Documentation

Browse R Packages

We want your feedback!

tobiasgf/lulu
Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics